/* sHT -- find lexeme boundaries
   Copyright (C) 2019 Ariadne Devos

   This program is free software: you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation, either version 3 of the License, or
   (at your option) any later version.

   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program.  If not, see <http://www.gnu.org/licenses/>. */

#include <sHT/bitvec.h>
#include <sHT/index.h>
#include <sHT/lex.h>
#include <sHT/string.h>
#include <sHT/taint.h>
#include <sHT/test.h>

size_t
sHT_lex(struct sHT_lex_buf *to, const unsigned char from[], size_t n, const struct sHT_lex_type *c)
{
	/* The current offset into @var{from}. (Set later). */
	size_t i;
	/* The old number of accumulated bytes,
	   therefore, the index into @var{to->bytes} to start writing to. */
	size_t offset = to->offset;
	/* The following loop: iterate over the bytes of @var{from}, to
	  validate their syntax and copy them -- and process a fragment when
	  complete. However, not all bytes, as only @code{c->max_known} are
	  allocated.

	  @var{offset}: data from previous runs is remembered. */
	/* Underflow 1: @var{sHT_lex_buf} invariant.
	  Bounds 0 -> @var{sHT_parser} precondition.
	  Bounds 1 -> type widths in @var{c} and @var{to} */
	size_t todo = sHT_min_size(n, c->max_known - offset);
	/* Induct over byte locations, until a space character, a syntax error
	  or the method is found to be too long to be known. */
	/* (1) todo <= @var{n} (@var{sHT_min_size}),
	  (2) n < SSIZE_MAX,
	  (1, 2) => todo < SSIZE_MAX.
	  QED @var{sHT_index_iterate} max bounds.

	  (1) @var{n} != 0 (precondition)
	  (2) offset < max_known
	  (2) => (3) 0 < max_known - offset
	  (1, 3): QED @var{sHT_index_iterate} positivity. */
	/* Invariant: byte offset to offset + i (exclusive) of
	  @code{buf->bytes} are set. Base case: trivial. */
	sHT_index_iterate(i, todo) {
		/* If zero @var{n} were allowed, this would be out of bounds */
		/* (1) i < todo (@var{sHT_index_iterate})
		  (2) todo <= n (@var{sHT_min_size})
		  (1, 2) => i < n
		  QED @var{from} length */
		uint8_t b = from[i];
		/* (1) i < todo (@var{sHT_index_iterate})
		  (2) todo <= max_known - offset (@var{sHT_min_size})
		  (1, 2) => (3) i < max_known - offset
		  (3) => offset + i < max_known
		  QED @var{to} capacity.

		  QED induction step (is set). */
		to->bytes[offset + i] = b;
		if (sHT_bit_test(c->c_allow, b)) {
			/* Correct byte, but not a terminator.
			  Continue the search. */
			continue;
		}
		/* Non-speculatively, @var{b} is not one of the allowed
		  bytes. Either it is the terminator, or a syntax error.
		  Which one? (0: syntax error, 1: terminator)*/
		int which = sHT_eq_bool(c->c_stop, b);
		/* Not used anymore; taint for analysis */
		sHT_taint(&to->offset);
		/* +1: also count the terminating byte
		  (<tests/lex.c> found this bug)

		  (1) i < todo (@var{sHT_index_iterate}),
		  (2) todo <= n (@var{sHT_index_iterate})
		  (1, 2) => (3) i < n.
		  (3) => (4) i + 1 <= n
		  QED bounds last argument

		  (1) i < todo (@var{sHT_index_iterate}),
		  (2) todo <= max_known - offset (@var{sHT_min_size})
		  (1, 2) => (3) i <= max_known - offset
		  (3) => (4) offset + i <= max_known

		  QED length/index bounds */
		return c->cb_value[which](to, to->bytes, offset + i, i + 1);
	}

	/* Compare the number of running total of tested bytes with the
	  maximal known lexeme length. If it the former begins to equal
	  the latter, there is no point in copying anymore, but the
	  syntax must still be validated. */
	/* Overflow:

	  (1) i < todo (@var{sHT_index_iterate}),
	  (2) todo <= max_known - offset (@var{sHT_min_size})
	  (1, 2) => (3) i <= max_known - offset
	  (3) => (4) offset + i <= max_known
	  (5) max_known < SSIZE_MAX (@var{sHT_lex_type})
	  (4, 5) => offset + i < SSIZE_MAX

	  QED no overflow */
	if (sHT_ge(offset + i, c->max_known)) {
		/* Not used anymore; taint for analysis */
		sHT_taint(&to->offset);
		sHT_taint(&to->bytes[0]);
		return c->cb_ignore(to, i);
	}

	/* More bytes must be read before the lexeme is complete.
	  Proof of progress (i = n) (non-speculatively):

	  (1) offset + i < max_known (@var{sHT_ge})
	  (2) i = todo (@var{sHT_index_iterate})
	  (1) => (4) i < max_known - offset
	  (2, 4) => (5) todo < max_known - offset
	  (5) => todo = n (@var{sHT_min_size})
	  (2, 5) => i = n

	  QED progress */
	/* Remember the number of copied bytes */
 	/* Overflow/bounds:

	  (1) i <= todo (@var{sHT_index_iterate}),
	  (2) todo <= max_known - offset (@var{sHT_min_size})
	  (1, 2) => (3) i <= max_known - offset
	  (3) => (4) offset + i <= max_known
	  QED bounds; continue overflow

	  (5) max_known < UINT16_MAX (@var{uint16_t})
	  (4, 5) => offset + i < UINT16_MAX

	  QED overflow */
	to->offset += i;
	/* Bounds:

	  (1) i <= todo (@var{sHT_index_iterate})
	  (2) todo <= n (@var{sHT_min_size})
	  (1, 2) => i <= n

	  QED bounds */
	return i;
}
